New York City Flights Data Playground

Dataset

flights
arrange(flights, year, desc(month), day)

Take a simple summary of this dataset:

str(flights)
## Classes 'tbl_df', 'tbl' and 'data.frame':    336776 obs. of  19 variables:
##  $ year          : int  2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int  517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int  515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num  2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int  830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int  819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num  11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr  "UA" "UA" "AA" "B6" ...
##  $ flight        : int  1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr  "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr  "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr  "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num  227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num  1400 1416 1089 1576 762 ...
##  $ hour          : num  5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num  15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct, format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...

Common Variable Types

Common Variable Types in DataFrame:

  • int stands for integers
  • dbl stands for doubles or real numbers
  • chr stands for character vectors, where strings are included as well
  • dttm is the abbreviation for data-times
  • date stands for dates
  • lgl stands for logical or vectors that contain TRUE or FALSE
  • fctr stands for factors, which R uses to represent categorical variables with fixed possible values

dplyr Functions

# Select all flights on Jan 1st.
filter(flights, month == 1, day == 1)
# Select all flights in Nov or Dec.
filter(flights, month == 11 | month == 12) # Pay attention to the `|` here, it is not `||`
filter(flights, month %in% c(9, 10))        # Select all flights in Sep or Oct.
filter(flights, between(dep_time, 0, 600))  # Departed between midnight and 6am
cat("[ sqrt(2) ^ 2 == 2 ] ->", sqrt(2) ^ 2 == 2)
## [ sqrt(2) ^ 2 == 2 ] -> FALSE
cat("\n[ near(sqrt(2) ^ 2, 2) ] ->", near(sqrt(2) ^ 2, 2))
## 
## [ near(sqrt(2) ^ 2, 2) ] -> TRUE
cat("\n[ 1/49 * 49 == 1 ] ->", 1/49 * 49 == 1)
## 
## [ 1/49 * 49 == 1 ] -> FALSE
cat("\n[ near(1/49 * 49, 1) ] ->", near(1/49 * 49, 1))
## 
## [ near(1/49 * 49, 1) ] -> TRUE
# Pay attention to the `|` here, which isn't `||`
all(
  filter(flights, !(arr_delay > 120 | dep_delay > 120)) ==
    filter(flights, arr_delay <= 120, dep_delay <= 120)
)
## [1] TRUE
NA
## [1] NA
NA > 33
## [1] NA
NA - 2
## [1] NA
NA ^ NA
## [1] NA
is.na(NA)    # A recommended way to determine whether a value is missing
## [1] TRUE

NA will make almost any operation unknown.

select(flights, year:day)        # Same with the below
# select(flights, year, month, day)
select(flights, -(year: day))    # Select all colums except year, month and day
select(flights, starts_with("dep_"))
select(flights, contains("dep_"))
select(flights, contains("dep_"), everything()) # All dep fields moved to the start of the dataframe
select(flights, 出発時間 = dep_time, 予定出発 = sched_dep_time, 出発遅延 = dep_delay)
# Slight Difference between `select` and `rename`
rename(flights, 出発時間 = dep_time, 予定出発 = sched_dep_time, 出発遅延 = dep_delay)
mutate(
  select(flights, year:day, ends_with("delay"), distance, air_time),
  gain = dep_delay - arr_delay,
  speed = distance / air_time * 60,
  gain_per_hour = gain / air_time * 60 # Refer to the newly created column
)
transmute(
  flights,
  gain = dep_delay - arr_delay,
  speed = distance / air_time * 60,
  gain_per_hour = gain / air_time * 60 # Refer to the newly created column
)   # which will drop all original variables in the table
summarise(flights, delay = mean(dep_delay, na.rm = TRUE), count = n())
summarise(
  group_by(flights, year, month, day),
  delay = mean(dep_delay, na.rm = TRUE),
  count = n()
)
# (delay_dataset <- summarise(
#   group_by(flights, dest),
#   count = n(),
#   dist = mean(distance, na.rm = TRUE),
#   delay = mean(arr_delay, na.rm = TRUE)
# ))
# (delay_filtered <- filter(delay_dataset, count > 20, dest != "HNL"))
#
# Use Pipe to make the transformation above concise:
(delays <- flights %>%
   group_by(dest) %>%
   summarise(
     count = n(),
     dist = mean(distance, na.rm = TRUE),
     delay = mean(arr_delay, na.rm = TRUE)
   ) %>%
   filter(count > 20, dest != "HNL")
)
ggplot(delays, aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1/3) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Visualize the density through count
(delays <- flights
  %>% filter(!is.na(dep_delay),!is.na(arr_delay)) # Filter the flights not cancelled
  %>% group_by(tailnum)
  %>% summarise(
    delay = mean(arr_delay),
    count = n()
  )
)
filter(delays, count > 25)
delays %>%
  ggplot(mapping = aes(x = delay)) +
  geom_freqpoly(binwidth = 10) +
  xlab("delay [min]")

delays %>%
  # filter(count > 25) %>%     # Filter to few flights to see variation
  ggplot(mapping = aes(x = delay, y = count)) +
  geom_point(alpha = 1/8) +    # Use the stastistic through count
  xlab("delay [min]")

# Estimation Upon Average Delay among NYC-Flights
(
  flights
    %>% filter(!is.na(dep_delay),!is.na(arr_delay)) # Filter the flights not cancelled
    %>% group_by(year, month, day)
    %>% summarise(
      avg_delay_for_all = mean(arr_delay),
      avg_delay_for_positive_part = mean(arr_delay[arr_delay > 0])
    )
    %>% mutate(
      date_field = ISOdate(year, month, day) # Attach date type variable
    )
    %>% ggplot()
          + geom_point(
            mapping = aes(x = date_field, y = avg_delay_for_all, colour = "DelayForAll")
          )
          + geom_point(
            mapping = aes(x = date_field, y = avg_delay_for_positive_part, colour = "DelayForPositive")
          )
          + scale_color_manual(
            # Set the order in the legend
            breaks = c("DelayForPositive", "DelayForAll"),
            values = c("DelayForAll" = "brown", "DelayForPositive" = "violetred1")
          )
)

(
  flights
    %>% filter(!is.na(dep_delay),!is.na(arr_delay)) # Filter the flights not cancelled
    %>% group_by(dest)
    %>% summarise(
      distanceAvg = mean(distance),
      min = min(distance),
      max = max(distance),
      median = median(distance),
      standardDeviation = sd(distance),
      interquartileRange = IQR(distance),
      interquartileRange_0 = quantile(distance, 0.75) - quantile(distance, 0.25),
      medianAbsoluteDeviation = mad(distance),
      prove_quantile1 = mean(distance < quantile(distance, 0.25)) # Get a Proportion
    )
    %>% arrange(desc(standardDeviation))
)

Measures of spread:

  • Mean Squared Deviation: \(\frac{\sum (x-\mu)^2}{N}\)
    • \(N\) = Number of Data
  • Standard Deviation(Root Mean Squared Deviation): \(\sigma = \sqrt{\frac{\sum (x-\mu)^2}{N}}\)
    • \(N\) = Number of Data
  • Median Absolute Deviation:

    \(MAD(D) = median({|d_i - median(D)|})\)

(
  flights
  %>% filter(!is.na(dep_delay),!is.na(arr_delay)) # Filter the flights not cancelled
  %>% group_by(dest)
  %>% summarise(
    count = n(),
    carriers = n_distinct(carrier), # Same as the below
    # carriers_0 = length(unique(carrier)),
    carriers_list = paste(unique(carrier), collapse = ", ")
  )
  %>% arrange(desc(carriers))
)

Summary Fist and Last Flights each day

Departure time only of first and last flights each day

(
  flights
  %>% filter(!is.na(dep_delay),!is.na(arr_delay)) # Filter the flights not cancelled
  %>% group_by(year, month, day)
  %>% summarise(
    first_dep_a = min(dep_time),
    first_dep_b = first(dep_time),
    last_dep_a = max(dep_time),
    last_dep_b = last(dep_time),
    n_early = sum(dep_time < 500)   # Count flights left before 5am
  )
)

Details of first and last flights each day

(
  flights
  %>% filter(!is.na(dep_delay),!is.na(arr_delay)) # Filter the flights not cancelled
  %>% group_by(year, month, day)
  %>% mutate(
    ranking = min_rank(desc(dep_time)),
    ranking_reverse = min_rank(dep_time),
    range_r = paste(range(ranking), collapse = " ~ ")
  )
  %>% filter(ranking %in% range(ranking))
  %>% arrange(year, month, day, ranking)
)

Summary Mileage Each Day

count(flights, year, month, day, wt = distance)
count(flights, year, month, day, tailnum, wt = distance)

Rolling up Summary

(daily <- group_by(flights, year, month, day))
(per_day <- summarise(daily, count = n()))
(per_month <- summarise(per_day, count = sum(count)))
(per_year <- summarise(per_month, count = sum(count)))
daily %>%
  ungroup() %>%
  summarise(flights = n())